package ‘dplyr’ was built under R version 3.3.2package ‘knitr’ was built under R version 3.3.2package ‘tibble’ was built under R version 3.3.2package ‘readxl’ was built under R version 3.3.2package ‘MASS’ was built under R version 3.3.2package ‘cowplot’ was built under R version 3.3.2package ‘robustbase’ was built under R version 3.3.2package ‘car’ was built under R version 3.3.2package ‘readr’ was built under R version 3.3.2
Perform basic data checks on Rwanda round soil health study data to provide real time feedback to Nathaniel and enumeration teams to maximize the value of the teams in the field.
load("../rw_round_1/fieldDat_final.Rdata")
Load data and make variable names nice. ytwo is the complete data. yt is the data I’ll be manipulating.
dataDir <- normalizePath(file.path("..", "..", "data"))
source("../oaflib/commcareExport.R")
package ‘curl’ was built under R version 3.3.2package ‘jsonlite’ was built under R version 3.3.2package ‘stringi’ was built under R version 3.3.2package ‘stringr’ was built under R version 3.3.2package ‘sp’ was built under R version 3.3.2package ‘leaflet’ was built under R version 3.3.2
source("../oaflib/misc.R")
#source("../oaflib/ccSluthing.R")
ytwo <- getFormData("oafrwanda", "M&E", "17B Ubutaka (Soil)", forceUpdate=T)
[1] "querying commcare-export for 98d142ec1149d0098f4e72ce0a3570600ba53dd4"
[1] "commcare-export --project oafrwanda --query /var/folders/pw/0l8d_x690m7b71_dmx2ww6fw0000gn/T//RtmpBUzOTP/filec4737b7f658c.json --output-format csv --output /var/folders/pw/0l8d_x690m7b71_dmx2ww6fw0000gn/T//RtmpBUzOTP/filec47368e409ab.zip --username robert.on@oneacrefund.org --password 0n3@cr3"
[1] "finished querying commcare-export for 98d142ec1149d0098f4e72ce0a3570600ba53dd4"
names(ytwo) <- tolower(make.names(names(ytwo)))
yt <- ytwo %>%
setNames(gsub("x.form.", "", names(.))) %>%
mutate(sample_id = tolower(sample_id))
package ‘bindrcpp’ was built under R version 3.3.2
Reality check some observations. Are they completely missing?
blankRow <- apply(yt, 1, function(x){
sum(x=="" | is.na(x))
})
naRow <- data.frame(table(round(blankRow/dim(yt)[2],2)*100)) %>%
mutate(Var1 = as.numeric(as.character(Var1))) %>%
filter(Var1>=25) %>%
mutate(Var1 = paste0(Var1, "%"))
kable(naRow, col.names = c("% of columns missing values", "Freq"),format="markdown")
| % of columns missing values | Freq |
|---|---|
| 25% | 124 |
| 26% | 62 |
| 27% | 166 |
| 28% | 35 |
| 29% | 46 |
| 30% | 51 |
| 31% | 38 |
| 32% | 23 |
| 33% | 26 |
| 34% | 24 |
| 35% | 30 |
| 36% | 12 |
| 37% | 8 |
| 38% | 1 |
| 39% | 2 |
| 42% | 29 |
| 43% | 27 |
| 44% | 1 |
| 45% | 2 |
| 74% | 5 |
| 75% | 11 |
| 76% | 24 |
| 77% | 8 |
| 78% | 12 |
| 79% | 1 |
# identify which rows have the most missing values
mostMissing <- ifelse(blankRow > 80, TRUE, FALSE)
missingTab <- yt %>%
filter(mostMissing==T) %>%
dplyr::select(district, enumerator_in_16b, village, sample_id, name)
kable(missingTab, format='markdown', caption="Obs with over 80 missing values")
| district | enumerator_in_16b | village | sample_id | name |
|---|---|---|---|---|
| Gatsibo_Nlwh | nshimiyerugira | Icyerekezo | 2632 | Bajeneza JD'Amaur |
| Rwamagana | Niyigena Jean Pierre | Nyarugenge | 2875c | Benimana Domithira |
| Rwamagana | Utamuriza Jeanne | Nyirabujari | 2938 | 11Kayiranga Jean Bosco |
| Karongi | NA | Gihumo | 1380 | Mudakikwa Elias |
| Karongi | yankurije ange | Ngoma | 1338c | Mukarwego Jakerine |
| Kayonza | Hagenimana bienvenue | Kibimba | 3092c | Munyankusi Enoki |
| Kayonza | Hagenimana bienvenue | Nyobugogo | 3098c | Nyirahabimana Florance |
| Kayonza | Hagenimana bienvenue | Bwatamama | 3101 | Sindayigaya Emmanuel |
| Rwamagana | Utamuruza Jeanne | Murehe | 3009 | Niyitegeka Venersnda |
| Rwamagana | Utamuriza Jeanne | Gatare | 2952 | Barahira Eric |
| Rwamagana | Utamuriza Jeanne | Gatare | 2944 | Uwimana Alphonsine |
| Kayonza | NA | Bara | 2850 | Gatsinzi Faustin |
| Kayonza | Dusenge Pierre | Kagumiro | 2819 | Mukandamage Dancile |
| Kayonza | hagenimana bienvenue | Kajevuba | 2841c | Mukabatabazi Donatienne |
| Rutsiro | Habimana Eugene | Rugote | 2029c | Kanakuze Sara |
| Nyaruguru | Nyiramana cloudine | Rubuga | 602c | Bivugire Andre |
| Nyaruguru | Nyiramana cloudine | Rubuga | 602c | Bivugire Andre |
| Mugonero | NA | Rugeyo | 1767 | Bazigama Jean |
| Nyamasheke | NA | Mutiti | 971c | Uzabakiriho Phiripe |
| Rwamagana | Niyigena Jean Pierre | Nyarugenge | 2874 | Mukankunsi Foromina |
| Karongi | Antoinette Uwizeye | Gashari | 1212 | Sindikubwabo Thomas |
| Mugonero | NA | Uwingabo | 1659 | Basabose Pascal |
| Nyamasheke | NA | Rugabano | 1030 | Nyabuyenga Jean |
| Nyamasheke | Hakuzimana Simeon | Bururi | 793c | Kamuzima Pascasie |
| Gatsibo_Nlwh | NA | Rwimbogo | 2685c | Ndikubwimana Ildaphonse |
| Gatsibo_Lwh | Niyidufasha nathanael | Rurana | 2540 | Kwizera Isaac |
| Nyamasheke | Constance Nyiranjamahoro | Bigabiro | 835c | Nubwende Ezira |
| Nyamasheke | Tuyisrnge Emmanuel | Misirimbo | 866c | Urayeneza Purusikiya |
| Gatsibo_Nlwh | Nyandwi Anathalie | Kagugu | 2663c | Nyirampakaniye Eurerie |
| Kayonza | NA | Kajevuba | 2835c | Nyiramuhanda Clementine |
| Gatsibo_Nlwh | Nyandwi Anathalie | Gakunyu | 2703c | Nyirabanguka Francoise |
| Gatsibo_Nlwh | Nyandwi Anathalie | Mataba | 2711 | Nizeyimana Anasthase |
| Gatsibo_Nlwh | Nyandwi Anathalie | Bidudu | 2709 | Kantarama Xaverine |
| Rutsiro | sylvere hategekimana | Cyato | 2073c | Mukamuganga Eugenie |
| Kayonza | Hagenimana bienvenue | Nyabugogo | 2778c | Kankuyo Sesile |
| Gatsibo_Nlwh | nshimiyerugira | Gakiri | 2610 | Bigirimana Jeapolo |
| Gatsibo_Nlwh | Nshimiyerugira | Akagarama 1 | 2732c | Nshimiye |
| Gatsibo_Nlwh | NA | Burembo | 2604c | Gatsinzi Alphonse |
| Gatsibo_Nlwh | NA | Mwanama | 2606 | Ntambara Enock |
| Nyamasheke | NA | Rambura | 1182c | Bazimaziki Nehemie |
| Karongi | NA | Kirehe | 1365c | Niyomugabo Tierry |
| Nyanza | ndakaza philbert | Buhaza | 319 | Zirakurwa Uziyeri |
| Karongi | Niyonsaba MJudith | Kagangare | 1524 | Ayindimwe Alexie |
| Karongi | NA | Kigarama | 1525c | Nikuze Elizabeti |
| Huye | NA | Kigarama | 145 | Buregeya Cassien |
| Rutsiro | NA | Maryazo | 2405c | Nyambuga Stephania |
| Huye | NiyonsabaMJudith | Rubona | 65c | Nyirantezimana Oliver |
| Huye | RURANGIRWA Emmanuel | Rusuma | 130 | Buriminyundo Andrea |
| Karongi | Nshimiyimana Clement | Ruhande | 1423c | Murengerankwari Sameul |
| Karongi | Nshimiyimana Clement | Nyabivumu | 1621c | Uwimana Gatarina |
| Karongi | Nshimiyimana Clement | Ryangondo | 1473 | Nyiransabimana Esipelense |
| Karongi | Nshimiyimana Clement | NA | 1476c | Nshimyukiza Emanuel |
| Karongi | NA | Muhondo | 1266 | Nyiranduhura Sarayi |
| Karongi | NA | Kivumu | 1269c | Uwitonze Emmanuel |
| Rutsiro | Venerande N Habimana | Musongati | 2373c | Bucyeyeneza Louis |
| Rutsiro | nyanzira valentine | Buruseri | 2322c | Mbatejurundiagnes |
| Rutsiro | NA | Gitete | 2321 | Musabyimana Christine |
| Karongi | NA | Nyamagana | 1293 | Nyirabanama Monigue |
| Karongi | NA | Kigabiro | 1298 | Munyandamutsa Daniel |
| Karongi | NA | Gitega | 1397 | Nyirangarukiye Eugenie |
| Karongi | Kanyabugoyi Felicien | Kavumu | 1390 | Ngarukiye Simiyoni |
Some observations have 79% of observations missing. What’s going on with these? I’m only showing rows with greater than 25% missing. It’s somewhat arbitrary but to not have data for a fourth of the survey seems odd.
#table(yt$sample_id %in% unique(fieldDat$sample_id))
Through September 14, 2017 we have one soil sample id in round 2 that doesn’t exist in previous rounds. It is sample_id 1437, 1300.
What is up with that observation?
baseComplete <- fieldDat %>%
filter(season =="15b") %>%
dplyr::select(sample_id)
compB <- round(prop.table(table(baseComplete$sample_id %in% yt$sample_id)),2)
r1Complete <- fieldDat %>%
filter(season =="16b") %>%
dplyr::select(sample_id)
compR1 <- round(prop.table(table(r1Complete$sample_id %in% yt$sample_id)),2)
Through September 14, 2017 we’ve found 99% of baseline farmers. We have 26 to go.
Through September 14, 2017 we’ve found 99% of round 1 farmers. We have 25 to go.
As we get closer to the end of enumeration I will produce a report of sample_ids that have not yet been surveyed so we can understand precisely why some farmers were not found again.
# code to check merge match before match is performed
# the code should show you how many overlaps you have in your merge variable
# and a summary of the values that don't match
# tested using ke_round_1_py
vecFiller <- function(vec){
vec[2] = ifelse(is.na(vec[2]), 0, vec[2])
return(vec)
}
mergeReport <- function(master, using){
# dat1 into dat2
c1 <- vecFiller(table(master %in% using))
c2 <- vecFiller(table(using %in% master))
notMatch1 <- master[!master %in% using]
notMatch2 <- using[!using %in% master]
tab <- data.frame(rbind(c1, c2))
rownames(tab) <- c("Master in Using", "Using in Master")
outList <- list(tab, notMatch1, notMatch2)
names(outList) <- c("matching.table", "missing.from.using", "missing.from.master")
return(
outList
)
}
attReport <- mergeReport(baseComplete$sample_id, yt$sample_id)
We currently have 26 more surveys to complete from the baseline. The complete list of missing farmer ids will be shared once we’re down to a more reasonable number.
attOut is the list of baseline farmers not found during the second survey round. attReport are those farmers that are missing
attOut <- fieldDat %>%
filter(season =="15b") %>%
filter(sample_id %in% attReport$missing.from.using) %>%
dplyr::select(sample_id, district, village)
#load the baseline data
load("../rw_round_1/rawBaselineWithIdentifers.Rdata")
attritionTab <- d %>%
dplyr::select(district, selected_cell, umudugudu, sample_id, farmer_name) %>%
mutate(sample_id = tolower(sample_id), # because we're makring those not found
found = ifelse(sample_id %in% attOut$sample_id, FALSE, TRUE)) %>%
left_join(., ytwo[, c("x18a.farmer.name", "x18b.name.of.the.respondent", "x19a.farmer.phone.number", "x.form.sample_id",
"x20.in.case.we.can.not.reach.the.client.directly..please.provide.a.neighbor.s.phone.number",
"x14a.were..you.a.tubura.client.17a.", "x14b.are.you.a.tubura.client.17b.","x14c.are.you.a.tubura.client.in.18a.")], by=c("sample_id"= "x.form.sample_id"))
attritionTab <- attritionTab[,c(4,1,2,3,5,6, 7:13)]
#length(attritionTab$sample_id)==length(unique(attritionTab$sample_id))
# load data from Eric J on 9/11 - combine my missingn with those without a sample and update the googlesheet
ejDat1 <- read_xlsx("attrition table2017 09 11_EJ.xlsx", sheet=1) %>%
dplyr::select(sample_id, `sample collected (from tracker)?`, `reason (from tracker)`) %>%
rename(sample_collected = `sample collected (from tracker)?`,
reason = `reason (from tracker)`)
ejDat2 <- read_xlsx("attrition table2017 09 11_EJ.xlsx", sheet=2) %>%
dplyr::select(sample_id, `sample collected?`, reason) %>%
rename(sample_collected = `sample collected?`) %>%
rbind(., ejDat1)
#length(ejDat2$sample_id)==length(unique(ejDat$sample_id)) # duplicates in the missing data
# report missing values in the duplicates
dups <- ejDat2[duplicated(ejDat2$sample_id), "sample_id"] %>%
filter(!is.na(sample_id))
# show all the farmers for which we have multiple reasons listed:
# ejDat2 %>%
# filter(ejDat2$sample_id %in% dups$sample_id) %>%
# arrange(sample_id) %>%
# kable(., format='markdown')
Eric, there isn’t clarity in Jeannette’s form about what has and has not been collected. Can she make a master database instead of two separate ones? Add a second column if you want to indicate the issue. For the time being I’m not going to add this into the rwR2Attrition google sheet until we have a definitive answer on what the status is.
# now put this information with the attrition Tab
#attritionTab <- left_join(attritionTab, ejDat2, by="sample_id")
write.csv(attritionTab, file=paste0("attritionTabs/attrition table", format(Sys.time(), '%Y %m %d'), ".csv"), row.names=F)
# also update a googlesheet to make collaborating on this possible:
library(googlesheets)
package ‘googlesheets’ was built under R version 3.3.2
# first uploading of the data
# test <- gs_upload(file=paste0("attritionTabs/attrition table", format(Sys.time(), '%Y %m %d'), ".csv"), "rwR2Attrition", overwrite=T)
# now I want to only edit certain rows:
# test <- test %>%
# gs_edit_cells(input = attritionTab$found,
# anchor = "F2", byrow=FALSE)
#
# # bring in info from M&E team, merge reasons with current table and only show the farmers for which we are still missing an explanation
#
# test <- read_xlsx("Farmer not find for SHS survey 7.9.2017.xlsx", sheet=2) %>%
# rename(sample_id = `Soil ID`) %>%
# mutate(sample_id = tolower(sample_id))
# table(test$sample_id %in% attritionTab$sample_id) # do they appear among the missing?
# table(test$sample_id %in% subset(fieldDat, fieldDat$season=="15b")$sample_id) # they are farmers but I've found them
# dplyr::select(`Reason not found`, `Reason not found other`, District, Cell, Village, `Cell field`, `Respondent name`, `Farmer tel`, `Respondent tel`, `Neighbor tel`, sample_id) %>%
# left_join(., attritionTab, by="sample_id") %>%
# as.data.frame()
# find the farmer name in the original baseline data?
datatable(attritionTab)
The variable names are not great. I think it’s due to the CommCare API export. I’m going to see if there’s an easier way to have uesable variable names. I’ve creaetd a CC export titled “M&E - Soil Health Study - 17B Ubutaka (Soil)” on August 8th. I’m only using it to get more usable variable names quickly. Ideally there’d be away to get that info directly from CommCare.
varNames <- read_csv("M&E - Soil Health Study - 17B Ubutaka (Soil) (2017-08-04) 2017-08-04.csv") %>%
setNames(gsub("form.", "", names(.))) %>%
setNames(gsub("farmer.", "", names(.))) %>%
setNames(gsub("prim_17a_.", "", names(.))) %>%
setNames(gsub("sec_17a_.", "", names(.))) %>%
setNames(gsub("prim_17b_.", "", names(.))) %>%
setNames(gsub("sec_17b_.", "", names(.))) %>%
setNames(gsub("ammend_17a_.", "", names(.))) %>%
setNames(gsub("ammend_17b_.", "", names(.))) %>%
as.data.frame()
Parsed with column specification:
cols(
.default = col_character(),
number = col_integer(),
form.farmer_found = col_integer(),
completed_time = col_datetime(format = ""),
started_time = col_datetime(format = ""),
received_on = col_datetime(format = "")
)
See spec(...) for full column specifications.
names(yt)[11:112] <- names(varNames)[3:104]
# rename one variable to remove duplicates
names(yt)[31] <- "nameOne"
names(yt) <- tolower(names(yt))
# manually fix a couple other names
names(yt)[53] <- "crop1_yield_comparison_17a"
names(yt)[58] <- "crop2_yield_comparison_17a"
catVars <- names(yt)[sapply(yt, function(x){
is.character(x)
})]
enumClean <- function(dat, x, toRemove){
dat[,x] <- ifelse(dat[,x] %in% toRemove, NA, dat[,x])
return(dat[,x])
}
strTable <- function(dat, x){
varName = x
tab = as.data.frame(table(dat[,x], useNA = 'ifany'))
tab = tab[order(tab$Freq, decreasing = T),]
end = ifelse(length(tab$Var1)<10, length(tab$Var1), 10)
repOrder = paste(tab$Var1[1:end], collapse=", ")
out = data.frame(variable = varName,
responses = repOrder)
return(out)
}
# clean up known values
catEnumVals <- c("-99", "-88", "- 99", "-99.0", "88", "_88", "- 88", "0.88",
"--88", "__88", "-88.0", "99.0")
yt[,catVars] <- sapply(catVars, function(y){
yt[,y] <- enumClean(yt,y, catEnumVals)
})
# put all categorical values in lower
yt[,catVars] <- sapply(yt[,catVars], function(x) tolower(x))
responseTable <- do.call(rbind, lapply(catVars, function(x){
strTable(yt, x)
}))
kable(responseTable, format="markdown")
| variable | responses |
|---|---|
| appformid | ../../data/commcare_cache/98d142ec1149d0098f4e72ce0a35706 |
| id | 00072a70-bd5f-4291-a81d-17840b4884b5, 000ea6f0-2fb7-45ed-b0b0-4bff8d6096c5, 00162ffc-1c98-485a-be46-f8783e80647b, 00b3e2f5-c53e-4ff5-90f8-db734cd1b662, 00d89c38-ac7d-4b75-9d9b-947844f0a8a8, 00f7ff4c-84cb-43cb-aacc-d5392278e936, 0103734f-0b30-4bee-a6ca-172f98772cce, 01270485-942e-47c7-9d23-5233f1cd0f52, 014ef092-98a2-4ecb-9194-ba04e3255bd2, 0159c49a-5a6c-402f-9f09-d56972aba90c |
| domain | oafrwanda |
| date_header | NA |
| metadata.deviceid | 358310068624197, 358310068699280, 357103076500702, 357103076505578, 358310068599233, 358310068599506, 358310068599761, 358310068699157, 357103076496752, 358310068263228 |
| metadata.userid | 0cb6454736db554904dbdd8a32fd2572, 7954ad9145fd68242150fb3a7d001dec, 0cb6454736db554904dbdd8a32fd17b4, 0cb6454736db554904dbdd8a32fb90ec, 0cb6454736db554904dbdd8a32fe521f, 0cb6454736db554904dbdd8a32fe3e2f, 0cb6454736db554904dbdd8a32fe498a, 0cb6454736db554904dbdd8a32fe5083, 408e98c5bd832100d1e530d66a659f2c, 408e98c5bd832100d1e530d66ac43fea |
| metadata.username | nym2, gat1, nym3, rut6, rwm1, kyz1, kyz2, rwm2, rwm3, rwm4 |
| form.case..case_id | 4968796dfe4e4f8990794c0373faa06e, 4aed5f40760f41b4b798a265c16db675, 62a5ef2a6aae4ac4919bdf5d78d94709, 98922c8fae5e43c58c6502e6ff6f33df, aa8fe94224cf443fb9281d674ee913fa, ccf9cb15dad44273b89dc8b979a2c878, f704b47c73a14135a16298c68ed4d74c, 00562917ad364da18ac1f673415ed116, 0073c4c117924c0fae4f231b0482f78c, 00c9941f634546b5be6dccd93bbf6d4f |
| start_time | 12:00:00.000+02, 11:30:00.000+02, 10:30:00.000+02, 10:00:00.000+02, 09:00:00.000+02, 08:30:00.000+02, 13:06:00.000+02, 08:18:00.000+02, 09:20:00.000+02, 10:06:00.000+02 |
| enumerator | ntabudakeba olive, niyidufasha nathanael, hagenimana bienvenue, murekatete alphonsine, niyigena jean pierre, nyirampano bernadette, zimukwiye dominique, utamuriza jeanne, nyanzira valentine, nshimiyerugira |
| not_find_why | NA, other, moved, deceased, notfarm |
| not_find_why_other | NA, ntakiwuhinga, umurima we wafashwe na leta kubera umuhanda, bahahaye undi muvandi we, bahatanze i minani uwahahingaga ntiyo ngera kuhahinga uwobahahaye naramubuze, bahateye ishyamba, baragabanye nk umuryango niyongera kuhahinga, mubirigi eliel yarawugurishije uwuhinga ni racher ahogeze, ntakihahinga barahamwatse, ubwambere nkorera uy umuhinzi namusanze ku mu murima aho yahingaga,nsubiye yo bwa kabiri nsanga hari undi muhinzi nari nasanze mu rugo nsanga ari umugore we mbibwira manager ambwirako ari ukumureka no ubutaka bwafashwe igihembwe cya 16 b ntiyafatiwe ubutaka |
| soil_id | NA, 1646, 1676, 2333c, 2885, 978c, 10, 1001, 1001c, 1003 |
| soil_id2_ | NA, 1646, 1676, 2333c, 2885, 978c, 10, 1001, 1001c, 1003 |
| text_final.photo | NA, 1501036672690.jpg, 1501040329114.jpg, 1501052471877.jpg, 1501052777690.jpg, 1501053428782.jpg, 1501053613964.jpg, 1501053734690.jpg, 1501053901444.jpg, 1501053976830.jpg |
| district_17b | karongi, rutsiro, huye, rwamagana, nyamasheke, gatsibo, nym, nyamagabe, kayonza, gisagara |
| cell_input_17b | NA, kabona, rubumba, kagarama ka, muhororo, marimba a, butiruka, kibyagira a, mubuga, mutongoca |
| cell_field_17b | NA, rubumba, mubuga, kagarama, nyabicwamba, kabona, gisoke, kibyagira, nyagatare, birambo |
| village_17b | NA, kabeza, kigarama, murambi, karambo, kabuga, rugarama, kivumu, gasharu, gatare |
| nameone | NA, habimana emmanuel, hakizimana fidele, hakizimana joel, hategekimana francois, havugimana celestin, karekezi celestin, mujawamariya beatrice, mukeshimana claudine, nkundimana j baptiste |
| name_respondent | NA, habimana emmanuel, habimana celestin, karekezi celestin, kayumba josephine, mujawamariya beatrice, mukangiriye donatha, mukankusi beatrice, munyensanga emmanuel, musabyimana beatrice |
| tel | NA, 0, _99, ntayo, --99, 99, -9, 07, 0722007421, 0722014089 |
| tel_respondent | NA, 0, _99, 99, ntayo, --99, -9, 0727005638, 0727121920, 078 |
| neighbor_tel | NA, -8, 0, 0787393574, _99, 0722475878, 0727911362, 0781143339, 0782897631, 0782936691 |
| gender | male, female, NA |
| respondent_gender | female, male, NA |
| crop1_17a | gor, big, shy, not_plant_17a, yum, jum, soya, NA, ray, insina |
| seed_maize_crop1_17a | NA, new_hybrid, gor_nsp, opv_saved, hybride_saved, opv_new |
| crop2_17a | no_other_plant_17a, yum, gor, NA, insina, big, jum, ray, shaz, soya |
| seed_maize_crop2_17b | NA, gor_nsp, opv_saved, hybride, hybride_saved, opv_new |
| fert1_17a | none, dap, NA, npk-17, urea, npk-22, npk2555 |
| fert2_17a | none, urea, NA, dap, npk-17 |
| compost_qual_17a | good, average, NA, bad |
| compost_type_17a | cow, NA, goat, pig, kitchen_waste, plant, other, human, chicken |
| compost_other_17a | NA, intama, iy'inka n'ihene, compost, composite, inka n'ihene, composte, ibyatsi bagiye bahira, ibyatsi byinshi yavanzemo amase yinka, ifumbire ya wese |
| lime_17a | no_lime, NA, lime_outside, lime_tubura, both_tubura_non_tubura |
| crop1_17b | big, shy, saka, jum, not_plant_17b, gor, yum, soya, ray, nyo |
| seed_maize_crop1_17b | NA, new_hybrid, opv_new, gor_nsp, opv_saved, hybride_saved |
| crop2_17b | not_plant_crop2_17b, yum, NA, gor, insina, jum, big, not_plant_crop217b, soya, shaz |
| fert1_17b | none, dap, NA, npk-17, urea, npk-22, npk2555 |
| fert2_17b | NA, none, urea, dap, npk-17, npk-22, npk2555 |
| compost_qual_17b | NA, average, good, bad |
| compost_type_17b | NA, cow, goat, pig, kitchen_waste, plant, other, human, chicken |
| compost_other_17b | NA |
| lime_17b | no_lime, NA, lime_outside, lime_tubura, both_tubura_non_tubura |
| crop_residues | feed_animals, mulching, leave_field, compost_use, burn_field, NA, other, sell, burn_discard |
| anti_erosion_efforts | drainageditch, nothing, radicalterrace, gradualterrace, NA |
| d_plant_rows_slope_17b | na, across_slope, NA, down_slope |
| other_comments | ntazo, ntakibazo, ntayo, NA, 0, no comment, ntacyahindutse, ntacyahindutse., ntazo., noncomments |
| finish_time | 10:30:00.000+02, 08:30:00.000+02, 10:40:00.000+02, 11:30:00.000+02, 09:45:00.000+02, 10:50:00.000+02, 13:27:00.000+02, 13:50:00.000+02, 15:00:00.000+02, 08:50:00.000+02 |
| description | NA, igiti, ku musozi, iruhande rw'inzira, haruguru y'inzira, hejuru yinzira, insina, munsi y,inzira, ukikijwe nurubingo, intangiro hari inzira |
| neighbor_or_voisin_phone | NA |
| district | rutsiro, karongi, nyamasheke, mugonero, huye, rwamagana, gatsibo_nlwh, gatsibo_lwh, kayonza, nyamagabe |
| enumerator_in_16b | NA, hagenimana bienvenue, mucyowimihigo j mv, nyandwi anathalie, utamuriza jeanne, zimukwiye dominique, niyidufasha nathanael, nyirangirimana jeanne, nshimiyerugira, rukundo japhet |
| cell | rubumba, kabona, muhororo, nyabicwamba a, gitara b, kibyagira a, cyarubare a, gatsibo a, gitara a, gitara c |
| respondent_in_16b | NA, habimana emmanuel, akimana jeannette, benimana domithira, bimenyande djumapri, hagumagatsi gaspard, karekezi celestin, kayiranga j bosco, mujawamariya beatrice, mukabinyange cecile |
| village | kigarama, kabeza, murambi, kabuga, gasharu, karambo, rugarama, kivumu, gatare, remera |
| sample_id | 1264, 2632, 2875c, 2938, 315, 60, 602c, 10, 1001, 1001c |
| client_in_15b | yego, oya, NA |
| client_in_16b_or_17a | NA |
| gps_hidden | NA, -2.1754531 29.2847899 0.0 4138.0, -2.1774711 29.2884729 0.0 4970.0, -2.0294828 30.6384099 0.0 2815.0, -2.0564286 29.3996455 0.0 52.4, -1.593719 30.2564566 0.0 3647.0, -1.8408223 29.3231408 0.0 4321.0, -2.0392077 30.6286476 0.0 5000.0, -2.1717515 30.5800986 0.0 3840.0, -2.4216136 29.2036459 0.0 4227.0 |
| farmer_phone | ., 782924160, 722063424, 722354432, 782475072, 782901568, 782925056, 782929152, 782936320, 782936704 |
| enumerator_in_15b | mucyowimihigo j mv, rukundo japhet, dusenge pierre, hagenimana bienvenue, nshimiyerugira, nyirangirimana jeanne, niyidufasha nathanael, torero pacifique, zimukwiye dominique, ngomituje judith |
| respondent_phone | ., 722063424, 723013760, 726201472, 727511488, 728496832, 782708544, 782906112, 782929280, 782929344 |
| name | habimana emmanuel, 11kayiranga jean bosco, bajeneza jd'amaur, benimana domithira, bivugire andre, bugabo jean, hakizimana fidele, hategekimana francois, havugimana celestin, iyamurenye amoni |
repGraphs <- function(dat, x){
tab = as.data.frame(table(dat[,x], useNA = 'ifany'))
tab = tab[order(tab$Freq, decreasing = T),]
print(
ggplot(data=tab, aes(x=Var1, y=Freq)) + geom_bar(stat="identity") +
theme(legend.position = "bottom", axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title =paste0("Composition of variable: ", x))
)
}
adminVars <- tolower(c(names(yt)[grep("meta", names(yt))],"enum_name", "text_final.photo", "participation", "refusal", "phone", "comment", "gps", "sample_id", "sampling.barcode", "id", "domain", "date_header", "form.case..case_id", "site", "district1", "site1", "plot.location", "New_soil_sample_id", "#form/Sampling2017_Complete", "appformid",names(yt)[grep("soil_id", names(yt))], "nameOne", "tel", "tel_respondent", "neighbor_tel", "finish_time", "other_comments", "Description", "neighbor_or_voisin_phone", "name", "cell_field_17b", "name_respondent","respondent_in_16b","gps_hidden", "farmer_phone", "respondent_phone", "village", "cell", names(yt)[grep("enumerator", names(yt))], "start_time", "village_17b", "cell_input_17b", "not_find_why_other", "compost_other_17a"))
nonAdminVars <- catVars[!catVars %in% adminVars]
for(i in 1:length(nonAdminVars)){
repGraphs(yt, nonAdminVars[i])
}
numVars <- names(yt)[sapply(yt, function(x){
is.numeric(x)
})]
Basic cleaning of known issues like enumerator codes for DK, NWR, etc.
enumVals <- c(-88,-85, -99, -9)
yt[,numVars] <- sapply(numVars, function(y){
yt[,y] <- enumClean(yt,y, enumVals)
yt[,y] <- as.numeric(yt[,y])
})
NAs introduced by coercion
temp <- numVars
noNeed <- c("intro.intro", "text_final.d_photo", "text_final.d_soilsample")
numVars <- numVars[!numVars %in% noNeed]
# remove variables that are entirely missing
totallyMissing <- sapply(yt[,numVars], function(x){
all(is.na(x))
})
numVars <- numVars[totallyMissing==F]
for(i in 1:length(numVars)){
if(length(unique(yt[,numVars[i]]))>10){
print(ggplot(data=yt, aes(x=yt[,numVars[i]])) +
geom_density() +
theme(legend.position = "bottom", axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x = numVars[i])
)
} else{
print(ggplot(data=yt, aes(x=yt[,numVars[i]])) +
geom_histogram(stat="count") +
theme(legend.position = "bottom", axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x = numVars[i])
)
}
#multiplot(temp1, temp2, cols = 2
}
Ignoring unknown parameters: binwidth, bins, pad
detectOutlier <- function(dat, x){
# function checks distribution and if lognormal, coverts and outputs
# outliers from log normal distribution. have to remove 0s to check
fits <- list(
no = fitdistr(dat[,x][complete.cases(dat[,x])],"normal"),
lo = fitdistr((dat[,x])[complete.cases(dat[,x]) & dat[,x]!=0 & dat[,x]>0],"log-normal")
#ca = fitdistr(dat[,x][complete.cases(dat[,x])],"cauchy"),
#we = fitdistr(dat[,x][complete.cases(dat[,x])], "weibull")
)
# get the logliks for each model...
fitCheck <- sapply(fits, function(i) i$loglik)
bestFit <-names(which.max(fitCheck))
trans <- sapply(dat[,x], function(y){
ifelse(bestFit=="lo", log10(y+1), y)
})
q1 = summary(trans)[[2]]
q3 = summary(trans)[[5]]
iqr = q3-q1
mark = ifelse(trans < (q1 - (1.5*iqr)) | trans > (q3 + (1.5*iqr)), 1,0)
# tab = rbind(
# summary(d[,x]),
# summary(d[mark==0, x])
# )
out = dat[mark==1, c("district", "cell", "enumerator", "sample_id", "farmer_phone", x)]
out = melt(out, measure.vars = x)
out = out[!is.na(out$district),]
return(out)
}
printIQR <- do.call(rbind, lapply(numVars, function(y){
#print(y)
return(detectOutlier(yt, y))
}))
NaNs producedNaNs producedNaNs producedNaNs producedNaNs produced
printIQR <- printIQR[order(printIQR$enumerator,printIQR$cell, printIQR$sample_id),]
Compost application and quantity
This appears to be odded coded. Some “I didn’t use compost” are 0s while others are NA.
table(yt$kg_compost_17a, yt$compost_17a, useNA = 'ifany')[c(1:10),]
0 1 <NA>
0 0 1 0
5 0 1 0
6 0 1 0
10 0 3 0
12.5 0 1 0
15 0 9 0
16 0 1 0
20 0 19 0
24 0 2 0
25 0 8 0
modifyOutlierOutput <- function(dat, varname, vals){
dat = dplyr::filter(dat, variable!=varname & (!value %in% vals))
return(dat)
}
printIQR <- modifyOutlierOutput(printIQR, "found", 1)
printIQR <- modifyOutlierOutput(printIQR, "compost", 1)
printIQR <- modifyOutlierOutput(printIQR, "compost_17a", 0)
printIQR <- modifyOutlierOutput(printIQR, "n_chickens", 2:7)
Read the values in this table with the above histograms and density plots in mind. Not all the values listed in the table will be impossible values but they were flagged for being far from the central mass of values we have on that data point
datatable(printIQR)
write.csv(printIQR, file=paste0("output/", "rw_shs_r2 vars to check update",format(Sys.time(), '%B %d %Y'), ".csv"), row.names = F)
for(i in 1:length(unique(printIQR$enumerator))){
outSheet <- printIQR[printIQR$enumerator==unique(printIQR$enumerator)[i], ]
write.csv(outSheet, file=paste("output",
paste("rw_shs_r2 vars to check ", unique(printIQR$enumerator)[i], ".csv", sep = ""), sep = "/"))
}